{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(trainset.data, trainset.target,\n",
    "                                                    test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 20465\n",
      "Most common words [('the', 10129), ('a', 7312), ('and', 6199), ('of', 6063), ('to', 4233), ('is', 3378)]\n",
      "Sample data [4, 663, 9, 2542, 8, 22, 4, 3378, 17841, 97] ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'centurys', 'new']\n"
     ]
    }
   ],
   "source": [
    "concat = ' '.join(trainset.data).split()\n",
    "vocabulary_size = len(list(set(concat)))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d'%(vocabulary_size))\n",
    "print('Most common words', count[4:10])\n",
    "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "GO = dictionary['GO']\n",
    "PAD = dictionary['PAD']\n",
    "EOS = dictionary['EOS']\n",
    "UNK = dictionary['UNK']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self, size_layer, num_layers, embedded_size,\n",
    "                 dict_size, dimension_output,margin=0.2):\n",
    "        \n",
    "        def cells(reuse=False):\n",
    "            return tf.nn.rnn_cell.BasicRNNCell(size_layer,reuse=reuse)\n",
    "        \n",
    "        def rnn(embedded,reuse=False):\n",
    "            with tf.variable_scope('model', reuse=reuse):\n",
    "                rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])\n",
    "                outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype = tf.float32)\n",
    "                W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())\n",
    "                b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())\n",
    "                return tf.matmul(outputs[:, -1], W) + b\n",
    "            \n",
    "        with tf.device('/cpu:0'):    \n",
    "            self.INPUT_1 = tf.placeholder(tf.int32, [None, None])\n",
    "            self.INPUT_2 = tf.placeholder(tf.int32, [None, None])\n",
    "            self.Y = tf.placeholder(tf.float32, [None, 1])\n",
    "            encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))\n",
    "            input1_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.INPUT_1)\n",
    "            input2_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.INPUT_2)\n",
    "            self.logits_1 = rnn(input1_embedded,False)\n",
    "            self.logits_2 = rnn(input2_embedded,True)\n",
    "            d = tf.sqrt(tf.reduce_sum(tf.pow(self.logits_1-self.logits_2, 2), 1, keep_dims=True))\n",
    "            tmp = self.Y * tf.square(d)    \n",
    "            tmp2 = (1 - self.Y) * tf.square(tf.maximum((margin - d),0))\n",
    "            self.cost = tf.reduce_mean(tmp + tmp2) /2\n",
    "            self.optimizer = tf.train.MomentumOptimizer(0.01, 0.99, use_nesterov=True).minimize(self.cost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "size_layer = 128\n",
    "num_layers = 2\n",
    "embedded_size = 128\n",
    "dimension_output = 32\n",
    "maxlen = 50\n",
    "batch_size = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(size_layer,num_layers,embedded_size,vocabulary_size+4,dimension_output)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = list(zip(train_X, train_Y))\n",
    "random.shuffle(c)\n",
    "train_X_1, train_Y_1 = zip(*c)\n",
    "\n",
    "c = list(zip(train_X, train_Y))\n",
    "random.shuffle(c)\n",
    "train_X_2, train_Y_2 = zip(*c)\n",
    "\n",
    "label_shuffle = np.expand_dims((np.array(train_Y_1) == np.array(train_Y_2)).astype('int'),1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 7.159314870834351\n",
      "epoch: 0, training loss: 0.371119\n",
      "\n",
      "time taken: 7.075716257095337\n",
      "epoch: 1, training loss: 0.005651\n",
      "\n",
      "time taken: 7.074535369873047\n",
      "epoch: 2, training loss: 0.003678\n",
      "\n",
      "time taken: 7.048555135726929\n",
      "epoch: 3, training loss: 0.002070\n",
      "\n",
      "time taken: 7.1200714111328125\n",
      "epoch: 4, training loss: 0.000913\n",
      "\n",
      "time taken: 7.097279787063599\n",
      "epoch: 5, training loss: 0.000275\n",
      "\n",
      "time taken: 7.189687728881836\n",
      "epoch: 6, training loss: 0.000149\n",
      "\n",
      "time taken: 7.128118515014648\n",
      "epoch: 7, training loss: 0.000183\n",
      "\n",
      "time taken: 7.178845405578613\n",
      "epoch: 8, training loss: 0.000073\n",
      "\n",
      "time taken: 7.181195974349976\n",
      "epoch: 9, training loss: 0.000084\n",
      "\n",
      "time taken: 7.181663990020752\n",
      "epoch: 10, training loss: 0.000042\n",
      "\n",
      "time taken: 7.173894882202148\n",
      "epoch: 11, training loss: 0.000089\n",
      "\n",
      "time taken: 7.198666095733643\n",
      "epoch: 12, training loss: 0.000019\n",
      "\n",
      "time taken: 7.187653064727783\n",
      "epoch: 13, training loss: 0.000041\n",
      "\n",
      "time taken: 7.254589319229126\n",
      "epoch: 14, training loss: 0.000016\n",
      "\n",
      "time taken: 7.234525918960571\n",
      "epoch: 15, training loss: 0.000020\n",
      "\n",
      "time taken: 7.272420883178711\n",
      "epoch: 16, training loss: 0.000047\n",
      "\n",
      "time taken: 7.242069721221924\n",
      "epoch: 17, training loss: 0.000019\n",
      "\n",
      "time taken: 7.216881275177002\n",
      "epoch: 18, training loss: 0.000018\n",
      "\n",
      "time taken: 7.233326196670532\n",
      "epoch: 19, training loss: 0.000023\n",
      "\n",
      "time taken: 7.3419716358184814\n",
      "epoch: 20, training loss: 0.000152\n",
      "\n",
      "time taken: 7.247962951660156\n",
      "epoch: 21, training loss: 0.000361\n",
      "\n",
      "time taken: 7.204127550125122\n",
      "epoch: 22, training loss: 0.000110\n",
      "\n",
      "time taken: 7.303826570510864\n",
      "epoch: 23, training loss: 0.000189\n",
      "\n",
      "time taken: 7.336746692657471\n",
      "epoch: 24, training loss: 0.000092\n",
      "\n",
      "time taken: 7.272712707519531\n",
      "epoch: 25, training loss: 0.000052\n",
      "\n",
      "time taken: 7.328545331954956\n",
      "epoch: 26, training loss: 0.000023\n",
      "\n",
      "time taken: 7.156804084777832\n",
      "epoch: 27, training loss: 0.000019\n",
      "\n",
      "time taken: 7.222546577453613\n",
      "epoch: 28, training loss: 0.000051\n",
      "\n",
      "time taken: 7.252626657485962\n",
      "epoch: 29, training loss: 0.000023\n",
      "\n",
      "time taken: 7.3687944412231445\n",
      "epoch: 30, training loss: 0.000084\n",
      "\n",
      "time taken: 7.205474615097046\n",
      "epoch: 31, training loss: 0.000109\n",
      "\n",
      "time taken: 7.292163133621216\n",
      "epoch: 32, training loss: 0.000102\n",
      "\n",
      "time taken: 7.217748641967773\n",
      "epoch: 33, training loss: 0.000041\n",
      "\n",
      "time taken: 7.308082580566406\n",
      "epoch: 34, training loss: 0.000016\n",
      "\n",
      "time taken: 7.210784912109375\n",
      "epoch: 35, training loss: 0.000069\n",
      "\n",
      "time taken: 7.206620693206787\n",
      "epoch: 36, training loss: 0.000019\n",
      "\n",
      "time taken: 7.314574956893921\n",
      "epoch: 37, training loss: 0.000008\n",
      "\n",
      "time taken: 7.243310928344727\n",
      "epoch: 38, training loss: 0.000010\n",
      "\n",
      "time taken: 7.3154518604278564\n",
      "epoch: 39, training loss: 0.000038\n",
      "\n",
      "time taken: 7.164898872375488\n",
      "epoch: 40, training loss: 0.000040\n",
      "\n",
      "time taken: 7.209126234054565\n",
      "epoch: 41, training loss: 0.000066\n",
      "\n",
      "time taken: 7.138073921203613\n",
      "epoch: 42, training loss: 0.000025\n",
      "\n",
      "time taken: 7.223880767822266\n",
      "epoch: 43, training loss: 0.000138\n",
      "\n",
      "time taken: 7.107582330703735\n",
      "epoch: 44, training loss: 0.000226\n",
      "\n",
      "time taken: 7.197932958602905\n",
      "epoch: 45, training loss: 0.000137\n",
      "\n",
      "time taken: 7.154123783111572\n",
      "epoch: 46, training loss: 0.000055\n",
      "\n",
      "time taken: 7.160715103149414\n",
      "epoch: 47, training loss: 0.000101\n",
      "\n",
      "time taken: 7.12724232673645\n",
      "epoch: 48, training loss: 0.000039\n",
      "\n",
      "time taken: 7.135353326797485\n",
      "epoch: 49, training loss: 0.000095\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for i in range(50):\n",
    "    total_loss = 0\n",
    "    lasttime = time.time()\n",
    "    for k in range(0, (len(train_X) // batch_size) * batch_size, batch_size):\n",
    "        batch_x_1 = str_idx(train_X_1[i:i+batch_size],dictionary,maxlen)\n",
    "        batch_x_2 = str_idx(train_X_2[i:i+batch_size],dictionary,maxlen)\n",
    "        batch_y = label_shuffle[i:i+batch_size]\n",
    "        loss, _ = sess.run([model.cost,model.optimizer],feed_dict={model.INPUT_1:batch_x_1,\n",
    "                                                                 model.INPUT_2:batch_x_2,\n",
    "                                                                 model.Y:batch_y})\n",
    "        total_loss += loss\n",
    "    total_loss /= (len(train_X) // batch_size)\n",
    "    print('time taken:', time.time()-lasttime)\n",
    "    print('epoch: %d, training loss: %f\\n'%(i,total_loss))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.spatial.distance import cdist\n",
    "\n",
    "batch_x = str_idx(train_X_1,dictionary,maxlen)\n",
    "batch_y = str_idx(test_X, dictionary,maxlen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "logits_train = sess.run(model.logits_1,feed_dict={model.INPUT_1:batch_x})\n",
    "logits_test = sess.run(model.logits_1,feed_dict={model.INPUT_1:batch_y})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_test = []\n",
    "for i in range(logits_test.shape[0]):\n",
    "    label_test.append(train_Y_1[np.argsort(cdist(logits_train, [logits_test[i,:]], 'cosine').ravel())[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.52      0.51      0.52      1068\n",
      "   positive       0.52      0.52      0.52      1065\n",
      "\n",
      "avg / total       0.52      0.52      0.52      2133\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(metrics.classification_report(test_Y, label_test, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
